MaxPoolGrad

描述 MaxPool 的反向传播（梯度）计算。该算子将上游梯度（dy）只回传到前向最大池化过程中被选为最大值的位置；其它位置的梯度为 0。

数学定义:

\[\begin{split}\text{output}_{b,\ h_i,\ w_i,\ c} = \begin{cases} \text{dy}_{b,\ h_o,\ w_o,\ c}, & \text{if } (h_i,\ w_i) = \displaystyle \arg\max_{(h,w)\in\mathcal{W}(h_o,w_o)} \text{input}_{b,\ h,\ w,\ c}, \\ 0, & \text{otherwise}. \end{cases}\end{split}\]

其中，\(\mathcal{W}(h_o, w_o)\) 表示输出位置 \((h_o, w_o)\) 对应的池化窗口区域。窗口像素位置 \((h, w)\) 可表示为：

\[h = h_o \cdot \text{stride}_h - \text{pad}_u + \Delta h\]

\[w = w_o \cdot \text{stride}_w - \text{pad}_l + \Delta w\]

\[\Delta h \in [0,\ \text{win}_h - 1], \qquad \Delta w \in [0,\ \text{win}_w - 1]\]

并且仅当采样点落在输入有效范围内时会被考虑：

\[0 \le h < \text{in}_h, \qquad 0 \le w < \text{in}_w.\]

实现细节说明:

前向池化使用窗口 \(\text{win}_h \times \text{win}_w\)，步长为 \(\text{stride}_h\), \(\text{stride}_w\)，并且在边界处使用 pad（pad_u, pad_l）。

反向传播时，输出梯度 tensor（即需要写入的输入梯度）在每个 batch 开始前先被初始化为 0（代码中有一次整体清零）。

对于每个输出像素 \((h_o,w_o)\) 以及每个通道 c：

在对应的输入窗口中找到前向最大值的位置 \((h^*,w^*)\)；

将上游梯度 \(\text{dy}_{b,h_o,w_o,c}\) 累加到该位置：\(\text{output}_{b,h^*,w^*,c} \mathrel{+}= \text{dy}_{b,h_o,w_o,c}\)。

其他位置梯度保持 0。

输入:

input - 输入张量指针，采用 NHWC 格式，形状为 \([batch,\ in\_h,\ in\_w,\ channel]\)

dy - 上游梯度张量指针，采用 NHWC 格式，形状为 \([batch,\ output\_h,\ output\_w,\ channel]\)

params - 参数数组，包含所有输入参数，顺序如下：

in_w - 输入张量的宽度 (W)

in_h - 输入张量的高度 (H)

win_w - 池化窗口的宽度，即窗口在 W 方向的大小

win_h - 池化窗口的高度，即窗口在 H 方向的大小

output_w - 输出特征图的宽度

output_h - 输出特征图的高度

batch - 批次大小，即输入中的 batch 数

channel - 通道数 C ，每个池化位置都分别对 C 个通道独立执行最大池化与裁剪

stride_w - 池化窗口在 W 方向的步长

stride_h - 池化窗口在 H 方向的步长

pad_l - 输入特征图左侧的填充大小

pad_u - 输入特征图上侧的填充大小

minf - 输出结果的下界值；params 中存放的是指向该值的指针（变量地址，例如 (long long)&minf），而不是数值本身

maxf - 输出结果的上界值；params 中存放的是指向该值的指针（变量地址，例如 (long long)&maxf），而不是数值本身

core_mask - 核心掩码，指定使用的计算核心

输出:

output - 输出张量指针，采用 NHWC 格式，形状为 \([batch,\ in\_h,\ in\_w,\ channel]\)。

支持平台：
FT78NE MT7004

备注

FT78NE 支持fp32, fp64

MT7004 支持fp16, fp32

调用时 input、dy、output 三个张量指针作为独立实参传入（见下方函数原型）；其余参数（core_mask 除外）按以下顺序打包到 long long params 数组中： in_w, in_h, win_w, win_h, output_w, output_h, batch, channel, stride_w, stride_h, pad_l, pad_u, minf, maxf（其中 minf、maxf 存放的是变量地址）

共享存储版本:

/* Shared-memory variant operating on float tensors.
 * input_ptr / dy_ptr / output_ptr: NHWC tensors (input, upstream gradient, input gradient).
 * params: packed long long parameter array (in_w ... maxf, see the parameter list above).
 * core_mask: mask selecting the compute cores to use. */
void fp_maxpool_grad_s(float *input_ptr, float *dy_ptr, float *output_ptr, long long *params, int core_mask);

/* Shared-memory variant operating on float16 tensors; same parameters as fp_maxpool_grad_s. */
void hp_maxpool_grad_s(float16 *input_ptr, float16 *dy_ptr, float16 *output_ptr, long long *params, int core_mask);

C调用示例：

//FT78NE示例
#include <stdio.h>
#include <stdlib.h> /* srand, rand */
#include <time.h>   /* time */

/*
 * FT78NE example: call the shared-memory float MaxPoolGrad operator.
 *
 * NOTE(review): gin_w, gin_h and gbatch are not defined in this snippet;
 * they are presumably globals supplied by the surrounding test project --
 * define them before building. fp_maxpool_grad_s must likewise be declared
 * (e.g. by the operator library's header) before this point.
 */
int main(int argc, char* argv[]) {
    /* Hard-coded buffer addresses; adjust them to the target's memory map. */
    float *input_ptr = (float *)0x81000000;
    float *dy_ptr = (float *)0x82000000;
    float *output_ptr = (float *)0x83000000;

    int in_w = gin_w;
    int in_h = gin_h;
    int win_w = 6;
    int win_h = 6;
    int output_batch = gbatch; /* number of batches */
    int channel = 1;
    int stride_w = 4;
    int stride_h = 4;
    int pad_l = 1;
    int pad_u = 1;
    float minf = 0;
    float maxf = 50;

    /* Compute output_w and output_h: the padding is counted on both sides
     * (pad * 2) and the division by the stride is rounded up. */
    int dividor = in_w + pad_l*2 - win_w;
    int output_w = (dividor + stride_w - 1) / stride_w  + 1;
    int dividor2 = in_h + pad_u*2 - win_h;
    int output_h = (dividor2 + stride_h - 1) / stride_h  + 1;

    /* Packed parameters; entries beyond index 13 are unused and kept zeroed. */
    long long params[17] = {0};
    params[0] = (long long)in_w;
    params[1] = (long long)in_h;
    params[2] = (long long)win_w;
    params[3] = (long long)win_h;
    params[4] = (long long)output_w;
    params[5] = (long long)output_h;
    params[6] = (long long)output_batch;
    params[7] = (long long)channel;
    params[8] = (long long)stride_w;
    params[9] = (long long)stride_h;
    params[10] = (long long)pad_l;
    params[11] = (long long)pad_u;
    /* minf/maxf are passed by address: store the pointer, do not cast the
     * float value itself to long long. */
    params[12] = (long long)&minf;
    params[13] = (long long)&maxf;

    srand(time(NULL));
    /* Fill the input tensor and the upstream gradient with random test data. */
    int input_size = output_batch * channel * in_w * in_h;
    int dy_size = output_batch * channel * output_w * output_h;
    int i;
    for (i = 0; i < input_size; i++) {
        input_ptr[i] = (float)(rand() % 100);
    }
    for (i = 0; i < dy_size; i++) {
        dy_ptr[i] = (float)(rand() % 100);
    }
    /* Low four bits set (same value as 0b1111, written in standard C);
     * core_mask selects the compute cores to use. */
    int core_mask = 0xF;
    fp_maxpool_grad_s(input_ptr, dy_ptr, output_ptr, params, core_mask);
    return 0;
}

私有存储版本:

/* Private-memory variant operating on float tensors.
 * input_ptr / dy_ptr / output_ptr: NHWC tensors (input, upstream gradient, input gradient).
 * params: packed long long parameter array (in_w ... maxf, see the parameter list above).
 * Unlike the shared-memory variant, no core_mask argument is taken. */
void fp_maxpool_grad_p(float *input_ptr, float *dy_ptr, float *output_ptr, long long *params);

/* Private-memory variant operating on float16 tensors; same parameters as fp_maxpool_grad_p. */
void hp_maxpool_grad_p(float16 *input_ptr, float16 *dy_ptr, float16 *output_ptr, long long *params);

C调用示例：

//FT78NE示例
#include <stdio.h>
#include <stdlib.h> /* srand, rand */
#include <time.h>   /* time */

/*
 * FT78NE example: call the private-memory float MaxPoolGrad operator.
 *
 * NOTE(review): gin_w, gin_h and gbatch are not defined in this snippet;
 * they are presumably globals supplied by the surrounding test project --
 * define them before building. fp_maxpool_grad_p must likewise be declared
 * (e.g. by the operator library's header) before this point.
 */
int main(int argc, char* argv[]) {
    /* Hard-coded buffer addresses; adjust them to the target's memory map. */
    float *input_ptr = (float *)0x10010000;
    float *dy_ptr = (float *)0x10020000;
    float *output_ptr = (float *)0x10030000;

    int in_w = gin_w;
    int in_h = gin_h;
    int win_w = 6;
    int win_h = 6;
    int output_batch = gbatch; /* number of batches */
    int channel = 1;
    int stride_w = 4;
    int stride_h = 4;
    int pad_l = 1;
    int pad_u = 1;
    float minf = 0;
    float maxf = 50;

    /* Compute output_w and output_h: the padding is counted on both sides
     * (pad * 2) and the division by the stride is rounded up. */
    int dividor = in_w + pad_l*2 - win_w;
    int output_w = (dividor + stride_w - 1) / stride_w  + 1;
    int dividor2 = in_h + pad_u*2 - win_h;
    int output_h = (dividor2 + stride_h - 1) / stride_h  + 1;

    /* Packed parameters; entries beyond index 13 are unused and kept zeroed. */
    long long params[17] = {0};
    params[0] = (long long)in_w;
    params[1] = (long long)in_h;
    params[2] = (long long)win_w;
    params[3] = (long long)win_h;
    params[4] = (long long)output_w;
    params[5] = (long long)output_h;
    params[6] = (long long)output_batch;
    params[7] = (long long)channel;
    params[8] = (long long)stride_w;
    params[9] = (long long)stride_h;
    params[10] = (long long)pad_l;
    params[11] = (long long)pad_u;
    /* minf/maxf are passed by address: store the pointer, do not cast the
     * float value itself to long long. */
    params[12] = (long long)&minf;
    params[13] = (long long)&maxf;

    srand(time(NULL));
    /* Fill the input tensor and the upstream gradient with random test data. */
    int input_size = output_batch * channel * in_w * in_h;
    int dy_size = output_batch * channel * output_w * output_h;
    int i;
    for (i = 0; i < input_size; i++) {
        input_ptr[i] = (float)(rand() % 100);
    }
    for (i = 0; i < dy_size; i++) {
        dy_ptr[i] = (float)(rand() % 100);
    }

    fp_maxpool_grad_p(input_ptr, dy_ptr, output_ptr, params);
    return 0;
}